{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"corpus_metadata.ipynb","provenance":[],"collapsed_sections":[],"toc_visible":true,"mount_file_id":"1vQamJ9wh7IBaeaMBYC8BwfzK7U_Y-OSP","authorship_tag":"ABX9TyMDAgFwJ9MNOHvslvawTI/i"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"markdown","metadata":{"id":"SMe5aaQianRm"},"source":["# Stage 1. Corpus Creation\n","\n","\n"]},{"cell_type":"markdown","metadata":{"id":"_gd9gfYsfbNB"},"source":["## 1. Assembling metadata\n","\n","To retrieve metadata on the winning books, I used an online tool to scrape tables from the Wikipedia pages of the prizes and turn them in csv files. Then, I assembled them together using the following code in PyCharm. I made sure there were no redundant information – since often times some of the nominees of the Nebula and the Hugo overlap – and I created a CSV files with all the books’ metadata. The corpus now counted 726 entries."]},{"cell_type":"code","metadata":{"id":"RqbmzPnI9PsL"},"source":["import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import seaborn as sns\n","import csv"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"71juoNx6aije"},"source":["def finalmeta_extraction(file_path):\n","\n","    with open(file_path, 'r', encoding='utf-8') as csvfile:  \n","        reader = csv.DictReader(csvfile)  \n","        finalmeta_dict = dict() \n","        for row in reader:  \n","            if row[\"date\"] > \"1899\" and \"scifi\" in row[\"tags\"]:\n","                dictid = row[\"title\"].lower()\n","                finalmeta_dict[dictid] = dict()\n","                finalmeta_dict[dictid][\"htid\"] = row[\"docid\"]\n","                finalmeta_dict[dictid][\"author\"] = row[\"author\"]\n","                finalmeta_dict[dictid][\"date\"] = row[\"date\"]\n","                finalmeta_dict[dictid][\"title\"] = row[\"title\"]\n","\n","        return finalmeta_dict\n","\n","\n","def hugo_extraction(file_path):\n","\n","    with open(file_path, 'r', encoding='utf-8') as csvfile:  \n","        reader = csv.DictReader(csvfile)  \n","        hugo_dict = dict()  \n","        for row in reader:  \n","            year = str(int(row[\"Year\"]) - 1)\n","            title = title_polisher(row[\"Novel\"])\n","            dictid = title.lower()\n","            name = name_polisher(row[\"Author(s)\"])\n","            if dictid in hugo_dict.keys():\n","                hugo_dict[dictid][\"author\"] = hugo_dict[dictid][\"author\"] + \", \" + name\n","            else:\n","                hugo_dict[dictid] = dict()\n","                hugo_dict[dictid][\"htid\"] = \" \"\n","                hugo_dict[dictid][\"author\"] = name\n","                hugo_dict[dictid][\"date\"] = year\n","                hugo_dict[dictid][\"title\"] = row[\"Novel\"]\n","\n","        return hugo_dict\n","\n","\n","def nebula_extraction(file_path):\n","\n","    with open(file_path, 'r', encoding='utf-8') as csvfile:  \n","        reader = csv.DictReader(csvfile)  \n","        nebula_dict = dict()  \n","        for row in reader:  \n","            year = str(int(row[\"Year\"]) - 1)\n","            name = name_polisher(row[\"Author\"])\n","            dictid = row[\"Novel\"].lower()\n","            if dictid in nebula_dict.keys():\n","                nebula_dict[dictid][\"author\"] = nebula_dict[dictid][\"author\"] + \", \" + name\n","            else:\n","                nebula_dict[dictid] = dict()\n","                nebula_dict[dictid][\"htid\"] = \" \"\n","                nebula_dict[dictid][\"author\"] = name\n","                nebula_dict[dictid][\"date\"] = year\n","                nebula_dict[dictid][\"title\"] = row[\"Novel\"]\n","\n","        return nebula_dict\n","\n","\n","def name_polisher(name):\n","    name_wo_stars = name.replace(\"*\", \"\")\n","    name_split = name_wo_stars.split(\" \")\n","    if name_split[len(name_split)-2].lower() == \"van\" or name_split[len(name_split)-2].lower() == \"le\":\n","        last_name = name_split[len(name_split) - 2] + \" \" + name_split[len(name_split) - 1]\n","        name_split.insert(0, last_name)\n","        name_split.pop()\n","        name_split.pop()\n","    else:\n","        last_name = name_split[len(name_split)-1] + \",\"\n","        name_split.insert(0, last_name)\n","        name_split.pop()\n","\n","    polished_name = \" \".join(name_split)\n","\n","    return polished_name\n","\n","def title_polisher(title):\n","    title_split = title.split(\" (\")\n","    new_title = title_split[0]\n","\n","    return new_title"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"ffyduFzGgM9n"},"source":["def metadata_dict_creation(path1, path2, path3, path4):\n","    metadata_dict1 = dict()\n","    metadata_dict1.update(nebula_extraction(path3))\n","    metadata_dict1.update(hugo_extraction(path1))\n","    metadata_dict1.update(hugo_extraction(path2))\n","    metadata_dict1.update(finalmeta_extraction(path4))\n","\n","    metadata_dict2 = dict()\n","\n","    for value in metadata_dict1.values():\n","        dictid = value[\"date\"] + \"_\" + value[\"title\"]\n","        metadata_dict2[dictid] = dict()\n","        metadata_dict2[dictid].update(value)\n","\n","    sorted_metadata_dict = dict(sorted(metadata_dict2.items()))\n","\n","    return sorted_metadata_dict\n","\n","\n","def metadata_csv_creation(dictionary):\n","\n","    with open('metadata.csv', 'w', newline=\"\") as csvfile:\n","        fieldnames = [\"htid\", \"author\", \"date\", \"title\"]\n","        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)\n","        writer.writeheader()\n","        for i in dictionary.values():\n","            writer.writerow(i)\n","\n","\n","dictio = metadata_dict_creation(\"hugo.csv\", \"retrohugo.csv\", \"nebula.csv\", \"finalmeta.csv\")\n","\n","#print(metadata_csv_creation(dictio))"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"Q4lS03NYh2ht"},"source":["data = pd.read_csv(\"metadata.csv\")\n","#data.drop(columns=\"Unnamed: 0\", inplace=True)\n","data.set_index(\"htid\", inplace=True)\n","\n","data.head()"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"gjLI_P5n-sOD"},"source":["## 2. HTIDs retrieval\n","\n","Matthew Wilkens wrote an algorithm to retrieve the books' identifiers from the HathiTrust collection querying the available metadata.\n","\n","Then, 5% of the obtained identifiers were manually checked to evaluate whether the returned HTIDs were correct and whether a considerable amount of identifiers were missed.\n","\n","Since very few title were incorrect or had been missed, we proceeded to form the corpus with the obtained 330 titles."]},{"cell_type":"markdown","metadata":{"id":"QG6AkywQgbcf"},"source":["## 3. Metadata with htids"]},{"cell_type":"code","metadata":{"id":"FHQaXw1hiIbZ","colab":{"base_uri":"https://localhost:8080/","height":204},"executionInfo":{"status":"ok","timestamp":1596796963552,"user_tz":-120,"elapsed":1330,"user":{"displayName":"Federica Bologna","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhgEt2iQ-lqzChBHQtfeAS5RKgr8pTpJ6Iaco03=s64","userId":"07332947698373575453"}},"outputId":"807b3df8-fd9e-442b-bbbe-f6faa0a744c5"},"source":["data = pd.read_csv(\"/content/drive/My Drive/Università/3 ANNO MAGISTRALE/TESI/1_corpus/metadata_with_htids.csv\", encoding=\"utf-8\")\n","data.drop(columns=\"Unnamed: 0\", inplace=True)\n","data[\"decade\"] = data[\"date\"]//10*10\n","data.head()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>htid</th>\n","      <th>title</th>\n","      <th>author</th>\n","      <th>title_htrc</th>\n","      <th>date</th>\n","      <th>pub_date_htrc</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>nyp.33433076024060</td>\n","      <td>The secret of the crater</td>\n","      <td>Osborne, Duffield,</td>\n","      <td>NaN</td>\n","      <td>1900</td>\n","      <td>NaN</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>inu.30000042750632</td>\n","      <td>A woman of Mars</td>\n","      <td>Ling, M. A. Moore Bentley</td>\n","      <td>NaN</td>\n","      <td>1901</td>\n","      <td>NaN</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>njp.32101021206436</td>\n","      <td>The first men in the moon</td>\n","      <td>Wells, H. G.</td>\n","      <td>NaN</td>\n","      <td>1901</td>\n","      <td>NaN</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>nyp.33433074954656</td>\n","      <td>Welsh rarebit tales</td>\n","      <td>Owen, Harle Oren.</td>\n","      <td>NaN</td>\n","      <td>1902</td>\n","      <td>NaN</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>mdp.39015059642994</td>\n","      <td>Strange case of Dr. Jekyl</td>\n","      <td>Stevenson, Robert Louis,</td>\n","      <td>NaN</td>\n","      <td>1903</td>\n","      <td>NaN</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["                 htid                      title  ...  date pub_date_htrc\n","0  nyp.33433076024060   The secret of the crater  ...  1900           NaN\n","1  inu.30000042750632            A woman of Mars  ...  1901           NaN\n","2  njp.32101021206436  The first men in the moon  ...  1901           NaN\n","3  nyp.33433074954656        Welsh rarebit tales  ...  1902           NaN\n","4  mdp.39015059642994  Strange case of Dr. Jekyl  ...  1903           NaN\n","\n","[5 rows x 6 columns]"]},"metadata":{"tags":[]},"execution_count":3}]},{"cell_type":"code","metadata":{"id":"mTI2inTBkgoa","colab":{"base_uri":"https://localhost:8080/","height":312},"executionInfo":{"status":"ok","timestamp":1582822812850,"user_tz":-60,"elapsed":926,"user":{"displayName":"Federica Bologna","photoUrl":"https://lh3.googleusercontent.com/a-/AAuE7mAbXMmexGk_lRJUtqkV2mkHhg7CLpyfyXZzkkac=s64","userId":"07332947698373575453"}},"outputId":"3e40cab2-4e69-4209-bdcd-d09dc75691e7"},"source":["# Histogram plot for books with/without HTIDs\n","\n","plt.hist(data.date, bins=20, rwidth=0.90, zorder=3);\n","plt.xlim(data.date.min(), data.date.max())\n","plt.ylim(0,70);\n","plt.grid(axis=\"y\", alpha=0.75)\n","plt.xlabel(\"Publication date\")\n","plt.ylabel(\"Number  of  novels\")\n","plt.title(\"Corpus without htids\")\n","#plt.savefig(\"metadata.png\", dpi=300)"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["Text(0.5, 1.0, 'Corpus without htids')"]},"metadata":{"tags":[]},"execution_count":13},{"output_type":"display_data","data":{"image/png":"iVBORw0KGgoAAAANSUhEUgAAAX4AAAEWCAYAAABhffzLAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0\ndHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAeqElEQVR4nO3deZQdZZ3/8fcHCBDWJCyxASHs/HCh\nkR7AEdlBEIUICAhiWJzoERUPyBCVUVzGiTDKIvLTeBSjMgREEBBFILKpyN6EJcFASMaELOyEKEvg\nO3/U01Bp+nZX3751+3bX53XOPbfqqaqnvnWr+3vrPlX1lCICMzOrjpUGOwAzM2suJ34zs4px4jcz\nqxgnfjOzinHiNzOrGCd+M7OKceI3K0jSQ5L27GX6zZI+2cSQ8uv+maRv9WP+90t6pFH12dDixG8N\nJeloSXdLelHSQkm/l7TbYMfVCBHxjoi4GUDSmZJ+2Yz1StpT0vwB1hGStuoaj4jbImLbgUdnQ5ET\nvzWMpFOAc4FvA2OBTYELgUPqqGuVxkZnZl2c+K0hJK0LfAM4KSKuiIhlEfFqRFwTEaeleVaTdK6k\nJ9LrXEmrpWl7Spov6XRJi4CLcmVflvSUpLmSjsmtc4WmFUnHSfpTGpakcyQtkfSCpAckvbOHuPeS\n9EBu/AZJd+XGb5M0Pg3PlbSvpAOALwNHpl829+eq3EzSnyUtlXS9pPVzdR2cmoueS7H/v9y0FY7I\nu5paJK0J/B7YKK3rRUkb1dgNoyVdm9Z9h6QtU123pun3p+WP7P4rQtKOku5Ny14KrJ6btr6k36a4\nn0mfiXPHEOadZ43yXrJkcWUv83wF2BVoB3YAdgbOyE1/GzAG2AyYmCtbH9gYmABMkVSkiWJ/YHdg\nG2Bd4Ajg6R7m+yuwdUpuI4B3kyXZtSWNBDqA2/ILRMR1ZL9qLo2ItSJih9zko4HjgQ2BVYEvAkja\nBrgE+AKwAfA74BpJq/a2ERGxDDgQeCKta62IeKLG7EcBXwdGA48C/5nq2D1N3yEtf2l+oRTDb4Bf\nkH3+vwIOy81yKjA/xT2W7EvPfb0MYU781ijrAU9FxPJe5jkG+EZELImIJ8mS1LG56a8DX4uIlyPi\nn7ny/0hltwDXkiXxvrwKrA1sBygiZkbEwu4zpfXcRfYlsRNwP/Bn4H1kX1KzI6KnL4xaLoqIv6V6\nLyP7kgM4Erg2Im6IiFeB/wZGAv/aj7r7cmVE3Jn2wcW5dfdlV2AEcG76lXY52WfS5VWgDdgsTb8t\n3MnXkObEb43yNLB+H23zGwHzcuPzUlmXJyPipW7LPJuOemst06OI+CNwAfADYImkKZLWqTH7LcCe\nZMn/FuBmYI/0uqWvdXWzKDf8D2CtNLzCtkfE68DfyX7JNEqtdfdlI2BBt2Se309nk/2CuF7SHEmT\nBhamDTYnfmuU24GXgfG9zPMEWTNOl01TWZeejiJHp3bunpZZBqyRm/a2/IIRcX5E7ARsT9bkc1qN\nuLon/lvoO/H394h3hW2XJODtwIJU9A9qb0vZR9cLgY1TTF02fWPlEUsj4tSI2AI4GDhF0j4lx2Ql\ncuK3hoiI54GvAj+QNF7SGpJGSDpQ0llptkuAMyRtkE56fhUocknk1yWtKun9wIfI2qABOoFD07q2\nAk7sWkDSv0jaJbXbLwNeImtK6slfgG3JzjncGREPkSXpXYBbayyzGBjXj5OclwEHSdonxXQq2Rfl\nX3LbcrSkldPJ4z26rWu9dAK9XouBLWpMux1YDnw+7bNDyT4LACR9SNJW6YvheeA1an+WNgQ48VvD\nRMR3gVPITtg+SdaU8VmyE4cA3wLuBmYADwD3prLeLAKeJTtivhj4dETMStPOAV4hS2pT0/Qu6wA/\nTsvOI2uKOrtG3MtSLA9FxCup+HZgXkQsqRFX15fP05Lu7WMbiIhHgI8D3weeAj4MfDi3vpNT2XNk\n50J+k1t2FtmX5px0ZU2fTV09OBOYmpZf4RxJiuFQ4DjgGbLzEVfkZtkauBF4kexzuTAibqojBmsR\n8jkaa1XK7pL9ZURsMtixmA0nPuI3M6uY0hK/pG0ldeZeL0j6gqQx6SaZ2el9dFkxmJnZWzWlqUfS\nymRXL+wCnAQ8ExGT02VhoyPi9NKDMDMzoHlNPfsAj0XEPLJ+W6am8qn0fvmfmZk1WLM6wjqK7KoE\ngLG5OygXkd0C/haSJpJu2x85cuRO48aNKztGM7NhZebMmU9FxAbdy0tv6kn9gDwBvCMiFkt6LiJG\n5aY/GxG9tvN3dHTE3XffXWqcZmbDjaR7IqKje3kzmnoOBO6NiMVpfLGkthRUG1DrOmkzMytBMxL/\nx3izmQfgarJeFknvVzUhBjMzS0pN/KmPlf1Y8S7AycB+kmYD+6ZxMzNrklJP7qZb4dfrVvY02VU+\nZmY2CHznrplZxTjxm5lVjBO/mVnFOPGbmVVMs+7cNbMWN27StXUvO3fyQQ2MxMrmI34zs4px4jcz\nqxgnfjOzinHiNzOrGCd+M7OK8VU9ZtZSfHVR+XzEb2ZWMU78ZmYV46Yes2HCTSSNM9w/Sx/xm5lV\njBO/mVnFOPGbmVWME7+ZWcU48ZuZVYyv6jGzhhnuV8MMFz7iNzOrGCd+M7OKKTXxSxol6XJJsyTN\nlPReSWMk3SBpdnofXWYMZma2orKP+M8DrouI7YAdgJnAJGB6RGwNTE/jZmbWJKUlfknrArsDPwGI\niFci4jngEGBqmm0qML6sGMzM7K3KvKpnc+BJ4CJJOwD3ACcDYyNiYZpnETC2p4UlTQQmArS1tdHZ\n2VliqGbVNtD/r0b8f7ZKHcMhhr6UmfhXAd4DfC4i7pB0Ht2adSIiJEVPC0fEFGAKQEdHR7S3t5cY\nqtkwMG1B3Yu2t7cPfPlGxNCoOgaqFWIoUZlt/POB+RFxRxq/nOyLYLGkNoD0vqTEGMzMrJvSEn9E\nLAL+LmnbVLQP8DBwNTAhlU0AriorBjMze6uy79z9HHCxpFWBOcDxZF82l0k6EZgHHFFyDGYtz3e8\nNpY/z96VmvgjohPo6GHSPmWu18zMavOdu2ZmFePEb2ZWMU78ZmYV48RvZlYx7o/fzKxFlXV1ko/4\nzcwqxonfzKxinPjNzCrGid/MrGKc+M3MKsaJ38ysYpz4zcwqxonfzKxifAOXmVkJWrlraB/xm5lV\njBO/mVnFOPGbmVWME7+ZWcU48ZuZVYwTv5lZxTjxm5lVjBO/mVnFlHoDl6S5wFLgNWB5RHRIGgNc\nCowD5gJHRMSzZcZhZmZvasYR/14R0R4RHWl8EjA9IrYGpqdxMzNrksFo6jkEmJqGpwLjByEGM7PK\nKruvngCulxTAjyJiCjA2Iham6YuAsT0tKGkiMBGgra2Nzs7OkkM1G5oa8b8x0DpaIYZWqaMVYuir\njn4lfkkrAWtFxAsFF9ktIhZI2hC4QdKs/MSIiPSl8BbpS2IKQEdHR7S3t/cnVLOhZdqCuhd9439j\noHW0QgytUkcrxNCoOnrQZ1OPpP+RtI6kNYEHgYclnVZkxRGxIL0vAa4EdgYWS2pLdbcBS4rUZWZm\njVGkjX/7dIQ/Hvg9sDlwbF8LSVpT0tpdw8D+ZF8cVwMT0mwTgKvqiNvMzOpUpKlnhKQRZIn/goh4\ntVbzTDdjgSslda3nfyLiOkl3AZdJOhGYBxxRZ+xmZlaHIon/R2TX298P3CppM6DPNv6ImAPs0EP5\n08A+/QvTzMwapc/EHxHnA+fniuZJ2qu8kMzMrEw1E7+kU/pY9nsNjsXMzJqgtyP+tZsWhZmZNU3N\nxB8RX29mIGZm1hxFruPfRtJ0SQ+m8XdLOqP80MzMrAxFruP/MfAl4FWAiJgBHFVmUGZmVp4iiX+N\niLizW9nyMoIxM7PyFUn8T0nakqzDNSQdDizsfREzM2tVRW7gOomss7TtJC0AHgeOKTUqMzMrTZHE\nPy8i9k397awUEUvLDsrMzMpTpKnncUlTgF2BF0uOx8zMSlYk8W8H3EjW5PO4pAsk7VZuWGZmVpY+\nE39E/CMiLouIQ4EdgXWAW0qPzMzMSlHombuS9pB0IXAPsDruStnMbMjq8+SupLnAfcBlwGkRsazs\noMzMrDxFrup5dz+esWtmZi2uSFPPOpKulLQkvX4taZPSIzMzs1IUSfwXkT0nd6P0uiaVmZnZEFQk\n8W8QERdFxPL0+hmwQclxmZlZSYok/qclfVzSyun1ceDpsgMzM7NyFEn8J5BdvrmIrHO2w4HjywzK\nzMzKU+Rh6/OAg5sQi5mZNUGR6/g3AP4NGJefPyJOKLICSSsDdwMLIuJDkjYHpgHrkd0QdmxEvNL/\n0M3MrB5FmnquAtYl66/n2tyrqJOBmbnx7wDnRMRWwLPAif2oy8zMBqjIDVxrRMTp9VServc/CPhP\n4BRJAvYGjk6zTAXOBP5/PfWbmVn/FTni/62kD9ZZ/7nAvwOvp/H1gOciouvRjfOBjeus28zM6lDk\niP9k4MuSXiZ74LqAiIh1eltI0oeAJRFxj6Q9+xuYpInARIC2tjY6Ozv7W4VZJTTif2OgdbRCDK1S\nRyvE0FcdRa7qWbvO9b4PODj9WlidrDvn84BRklZJR/2bAAtqrHcK2SMf6ejoiPb29jrDMBsCpvX4\nb1DIG/8bA62jFWJolTpaIYZG1dGDQt0y1yMivhQRm0TEOOAo4I8RcQxwE9m9AAATyE4em5lZk5SW\n+HtxOtmJ3kfJ2vx/MggxmJlVVpE2/gGLiJuBm9PwHGDnZqzXzMzeajCO+M3MbBA15YjfbDgbN6k/\n9zOuaO7kgxoYiVkxNY/4Ja3WzEDMzKw5emvquR1A0i+aFIuZmTVBb009q0o6GvhXSYd2nxgRV5QX\nlpmZlaW3xP9p4BhgFPDhbtMCcOI3MxuCaib+iPgT8CdJd0eEr7U3MxsmilzV8wtJnwd2T+O3AD+M\niFfLC8vMzMpSJPFfCIxI7wDHknWj/MmygjIzs/IUSfz/EhE75Mb/KOn+sgIyM7NyFblz9zVJW3aN\nSNoCeK28kMzMrExFjvhPA26SNIesL/7NgONLjcrMzEpTpD/+6ZK2BrZNRY9ExMvlhmVmZmUp1FdP\nSvQzSo7FzMyawL1zmplVjBO/mVnFuFtmG9LcJbJZ//mI38ysYpz4zcwqptfEL2llSbOaFYyZmZWv\n18QfEa8Bj0jatEnxmJlZyYqc3B0NPCTpTmBZV2FEHFxaVGZmVpoiif8/6qlY0urArcBqaT2XR8TX\nJG0OTAPWA+4Bjo2IV+pZh5mZ9V+fJ3cj4hZgLjAiDd8F3Fug7peBvVPPnu3AAZJ2Bb4DnBMRWwHP\nAifWGbuZmdWhz8Qv6d+Ay4EfpaKNgd/0tVxkXkyjI9IrgL1TfQBTgfH9jNnMzAagyOWcJwHvA14A\niIjZwIZFKk9XBXUCS4AbgMeA5yJieZplPtkXiZmZNUmRNv6XI+IVSQBIWoXsyL1P6aqgdkmjgCuB\n7YoGJmkiMBGgra2Nzs7OoouaFdIKf1ONiKEV6miFGFqljlaIoa86iiT+WyR9GRgpaT/gM8A1/Qkg\nIp6TdBPwXmCUpFXSUf8mwIIay0wBpgB0dHREe3t7f1ZpVTGtxz+fQhr2N9WIGFqhjlaIoVXqaIUY\nGlVHD4o09UwCngQeAD4F/A44o6+FJG2QjvSRNBLYD5gJ3AQcnmabAFxVIAYzM2uQIg9ieV3SVOAO\nsiaeRyKiSFNPGzBV0spkXzCXRcRvJT0MTJP0LeA+4Cf1h29mZv3VZ+KXdBDwQ7ITswI2l/SpiPh9\nb8tFxAxgxx7K5wA71xeumZkNVJE2/u8Ce0XEowDpwevXAr0mfjMza01F2viXdiX9ZA6wtKR4zMys\nZDWP+CUdmgbvlvQ74DKyNv6Pkt29a2ZmQ1BvTT0fzg0vBvZIw08CI0uLyMzMSlUz8UfE8c0MxKrH\nj000GxxFrurZHPgcMC4/v7tlNjMbmopc1fMbsmvtrwFeLzccMzMrW5HE/1JEnF96JGZm1hRFEv95\nkr4GXE/Wxz4AEVGkT34zM2sxRRL/u4BjyfrR72rq6epX38zMhpgiif+jwBZ+PKKZ2fBQ5M7dB4FR\nZQdiZmbNUeSIfxQwS9JdrNjG78s5zcyGoCKJ/2ulR2FmZk1TpD/+W5oRiJmZNUeRO3eX8uYzdlcF\nRgDLImKdMgMzM7NyFDniX7trWNkT1w8Bdi0zKDMzK0+Rq3reEJnfAB8oKR4zMytZkaaeQ3OjKwEd\nwEulRWRmZqUqclVPvl/+5cBcsuYeMzMbgoq08btffjOzYaS3Ry9+tZflIiK+WUI8ZmZWst6O+Jf1\nULYmcCKwHuDEb2Y2BPX26MXvdg1LWhs4GTgemAZ8t9ZyuWXeDvwcGEt2H8CUiDhP0hjgUrInes0F\njoiIZ+vfBDMz649eL+eUNEbSt4AZZF8S74mI0yNiSYG6lwOnRsT2ZNf9nyRpe2ASMD0itgamp3Ez\nM2uSmolf0tnAXcBS4F0RcWZ/jswjYmHXw1oiYikwE9iY7IqgqWm2qcD4OmM3M7M69NbGfypZb5xn\nAF/JbtoFQGQndwt32SBpHLAjcAcwNiIWpkmLyJqCelpmIjARoK2tjc7OzqKrswpoxN9DK/xNtcp2\nDLSOVoihVepohRj6qqO3Nv5+3dVbi6S1gF8DX4iIF3JfIERESIqelouIKcAUgI6Ojmhvb29EONZK\npi2oe9E3/h4aUcdAtcp2DLSOVoihVepohRgaVUcPGpLca5E0gizpXxwRV6TixZLa0vQ2oMj5AjMz\na5Aid+7WJXXo9hNgZkR8LzfpamACMDm9X1VWDGZ9GTfp2rqXnTv5oAZGYtY8pSV+4H1kD2l/QFJX\nY9OXyRL+ZZJOBOYBR5QYg5mZdVNa4o+IP5GdCO7JPmWt18zMeldqG7+ZmbUeJ34zs4px4jczqxgn\nfjOzinHiNzOrGCd+M7OKceI3M6sYJ34zs4px4jczq5gyu2ywYcx93JgNXT7iNzOrGCd+M7OKceI3\nM6sYJ34zs4px4jczqxgnfjOzinHiNzOrGCd+M7OKceI3M6sY37lbQb7r1qzafMRvZlYxTvxmZhVT\nWuKX9FNJSyQ9mCsbI+kGSbPT++iy1m9mZj0r84j/Z8AB3comAdMjYmtgeho3M7MmKi3xR8StwDPd\nig8BpqbhqcD4stZvZmY9a/ZVPWMjYmEaXgSMrTWjpInARIC2tjY6OzubEJ71pRH7YbjU0QoxtEod\nrRBDq9TRCjH0VcegXc4ZESEpepk+BZgC0NHREe3t7U2LbdibtqDuRd/YD8OljlaIoVXqaIUYWqWO\nVoihUXX0oNlX9SyW1AaQ3pc0ef1mZpXX7MR/NTAhDU8Armry+s3MKq/MyzkvAW4HtpU0X9KJwGRg\nP0mzgX3TuJmZNVFpbfwR8bEak/Ypa51mZta3IdNXj/uXeZM/CzMbCHfZYGZWMU78ZmYVM2SaeoYL\nN9OY2WDzEb+ZWcU48ZuZVYwTv5lZxTjxm5lVjBO/mVnFOPGbmVWME7+ZWcU48ZuZVUxlbuBqxI1T\nvvnKzIYDH/GbmVWME7+ZWcU48ZuZVYwTv5lZxTjxm5lVjBO/mVnFOPGbmVWME7+ZWcU48ZuZVcyg\nJH5JB0h6RNKjkiYNRgxmZlXV9MQvaWXgB8CBwPbAxyRt3+w4zMyqajCO+HcGHo2IORHxCjANOGQQ\n4jAzqyRFRHNXKB0OHBARn0zjxwK7RMRnu803EZiYRt8JPNjUQJtvfeCpwQ6iRMN9+8DbOFwMp23c\nLCI26F7Ysr1zRsQUYAqApLsjomOQQyrVcN/G4b594G0cLqqwjYPR1LMAeHtufJNUZmZmTTAYif8u\nYGtJm0taFTgKuHoQ4jAzq6SmN/VExHJJnwX+AKwM/DQiHupjsSnlRzbohvs2DvftA2/jcDHst7Hp\nJ3fNzGxw+c5dM7OKceI3M6uYweqy4aeSlkh6MFe2g6TbJT0g6RpJ6+SmfSl17/CIpA/kylu264f+\nbKOk/STdk8rvkbR3bpmdUvmjks6XpMHYnp70dz+m6ZtKelHSF3NlLbkf6/g7fXea9lCavnoqHxb7\nUNIISVNT+UxJX8ot05L7EEDS2yXdJOnhtG9OTuVjJN0gaXZ6H53KlfbTo5JmSHpPrq4Jaf7ZkiYM\n1jYNWEQ0/QXsDrwHeDBXdhewRxo+AfhmGt4euB9YDdgceIzspPDKaXgLYNU0z/aDsT0N2MYdgY3S\n8DuBBbll7gR2BQT8HjhwsLetnm3MTb8c+BXwxTTesvuxn/twFWAGsEMaXw9YeTjtQ+BoYFoaXgOY\nC4xr5X2YYm0D3pOG1wb+lvLKWcCkVD4J+E4a/mDaT0r77Y5UPgaYk95Hp+HRg7199bwG5Yg/Im4F\nnulWvA1waxq+ATgsDR9C9sf2ckQ8DjxK1u1DS3f90J9tjIj7IuKJVP4QMFLSapLagHUi4q+R/eX9\nHBhffvTF9HM/Imk88DjZNnZp2f3Yz+3bH5gREfenZZ+OiNeG2T4MYE1JqwAjgVeAF2jhfQgQEQsj\n4t40vBSYCWxMFuPUNNtU3twvhwA/j8xfgVFpP34AuCEinomIZ8k+mwOauCkN00pt/A/x5h/LR3nz\nJq+Ngb/n5pufymqVt7Ja25h3GHBvRLxMtj3zc9OG7DZKWgs4Hfh6t/mH2n6stQ+3AULSHyTdK+nf\nU/mw2Ydkv9aWAQuB/wX+OyKeYQjtQ0njyH5h3wGMjYiFadIiYGwaHk45p0etlPhPAD4j6R6yn2Ov\nDHI8Zeh1GyW9A/gO8KlBiK1Ram3jmcA5EfHiYAXWILW2bxVgN+CY9P4RSfsMTogDVmsbdwZeAzYi\na3Y9VdIWgxNi/6WDj18DX4iIF/LT0q+xylzb3jJ99UTELLKfy0jaBjgoTeqti4ch1fVDL9uIpE2A\nK4FPRMRjqXgB2XZ1GcrbuAtwuKSzgFHA65JeAu5hCO3HXrZvPnBrRDyVpv2OrO38lwyffXg0cF1E\nvAoskfRnoIPsKLil96GkEWRJ/+KIuCIVL5bUFhELU1POklReK+csAPbsVn5zmXGXpWWO+CVtmN5X\nAs4AfpgmXQ0cldq8Nwe2JjtZNuS6fqi1jZJGAdeSnWj6c9f86WfoC5J2TVeCfAK4qumB90OtbYyI\n90fEuIgYB5wLfDsiLmCI7cde/k7/ALxL0hqpDXwP4OHhtA/Jmnf2TtPWJDvxOYsW34fpc/8JMDMi\nvpebdDXQdWXOBN7cL1cDn0hX9+wKPJ/24x+A/SWNTlcA7Z/Khp7BOKMMXELWTvgq2ZHSicDJZGfb\n/wZMJt1VnOb/CtlVA4+QuyKC7Oz739K0rwz2mfJ6t5Hsn2sZ0Jl7bZimdZB1Sf0YcEH+cxnsV3/3\nY265M0lX9bTyfqzj7/TjZO3jDwJn5cqHxT4E1iK7Iush4GHgtFbfhym23ciacWbk/r8+SHbl1XRg\nNnAjMCbNL7KHRT0GPAB05Oo6gewCk0eB4wd72+p9ucsGM7OKaZmmHjMzaw4nfjOzinHiNzOrGCd+\nM7OKceI3M6sYJ35reZJek9Qp6UFJv5K0Rh/z3yzpLQ/LlnScpAvS8KclfaKOWEZJ+kxufCNJl/e3\nngLr2VPSb/uYp13SBxu9bhv+nPhtKPhnRLRHxDvJug/49EArjIgfRsTP61h0FPBG4o+IJyLi8IHG\nU6d2suvRzfrFid+GmtuArSSN04p9yH9R0pm5+Y7N/UrYuXslks5UeiaApK0k3Sjp/tTB2paS1pI0\nPY0/IKmr07LJwJap7rPzcUhaXdJFaf77JO2Vyo+TdIWk61I/7mf1tGHK+rSfJele4NBc+c7K+se/\nT9JfJG2b7pD9BnBkiuVISWsq61//zjRvy/SQaa2lZfrqMetL6grhQOC6ArOvERHtknYHfkr2nINa\nLgYmR8SVyh6eshLZL4uPRMQLktYH/irparJ+298ZEe0ppnG5ek4i6+/rXZK2A65Pfd1AdnS+I/Ay\n8Iik70fEGz09pvX+mKxLhEeBS3P1zgLeHxHLJe1L1t3FYZK+SnZX6WdTHd8G/hgRJ6RuQO6UdGNE\nLCvweVmFOPHbUDBSUmcavo2s35WN+ljmEsj6m5e0TkqEbyFpbWDjiLgyzf9SKh8BfDt9cbxO1v3u\n2J7qyNkN+H6qZ5akeWTdNQNMj4jnU90PA5uxYhe/2wGPR8TsNM8vgYlp2rrAVElbk3U9MKLG+vcH\nDtabTzdbHdiUrP95szc48dtQ8M+uI+wukpazYlPl6t2W6d4XSX/7JjkG2ADYKSJelTS3h3X0x8u5\n4dfo3//eN4GbIuIj6RfGzTXmE3BYRDxST4BWHW7jt6FqMbChpPUkrQZ8qNv0IwEk7UbWu+LzPVUS\n2ROZ5it7OhjKeoFdg+woe0lK+nuRHaEDLCXro74nt5F9YXR1Z7wpWceCRcwCxknaMo1/LDdtXd7s\n5vi4XHn3WP4AfC71RomkHQuu2yrGid+GpMj6hP8GWRfdN5AlzryXJN1H1qXwiX1UdyzweUkzgL8A\nbyNr9++Q9ABZV8qz0nqfBv6cThqf3a2eC4GV0jKXAsdF9iS1ItvzElnTzrXp5O6S3OSzgP9K25P/\npXATsH3XyV2yXwYjgBmSHkrjZm/h3jnNzCrGR/xmZhXjxG9mVjFO/GZmFePEb2ZWMU78ZmYV48Rv\nZlYxTvxmZhXzf3U/n9qwjos2AAAAAElFTkSuQmCC\n","text/plain":["<Figure size 432x288 with 1 Axes>"]},"metadata":{"tags":[]}}]},{"cell_type":"code","metadata":{"id":"tB717l3LlaSg"},"source":["#Code by Matthew Wilkens\n","def validate_htid(s, none_as_good=True):\n","    '''\n","    Check HTID string for validity. \n","    Not rigorous; just contains a period and\n","    is more than five characters long.\n","    Text string 'None' in HTID field indicates \n","    human determination that vol not in Hathi. This is valid.\n","    Returns true if HTID is valid, else False.\n","    '''\n","    if s == 'None':\n","        if none_as_good:\n","            return True\n","        else:\n","            return False\n","    elif s is not None and len(str(s)) > 5 and '.' in str(s):\n","        return True\n","    else:\n","        return False"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"YQUt8PtPl9fU"},"source":["# Code by Matthew Wilkens\n","# Select rows with good HTIDS\n","d = data.loc[data.htid.apply(lambda x: validate_htid(x, none_as_good=False))]"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"MFWfTcJV4uW9","colab":{"base_uri":"https://localhost:8080/","height":375},"executionInfo":{"status":"ok","timestamp":1596805324553,"user_tz":-120,"elapsed":1489,"user":{"displayName":"Federica Bologna","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhgEt2iQ-lqzChBHQtfeAS5RKgr8pTpJ6Iaco03=s64","userId":"07332947698373575453"}},"outputId":"cc0da343-5d2f-4adf-a3ae-b9e475fef5dc"},"source":["xs = d.decade.unique()\n","ys = d.groupby(\"decade\").count()[\"htid\"].values\n","plt.figure(figsize=(9,5))\n","plt.xlim(1894, 2006)\n","plt.ylim(0,80)\n","plt.xlabel('Publication date')\n","plt.xticks(xs)\n","plt.ylabel('Number of novels')\n","\n","rects = plt.bar(xs, ys, 9)\n","for rect in rects:\n","  height = rect.get_height()\n","  plt.annotate('{}'.format(height),\n","                    xy=(rect.get_x() + rect.get_width() / 2, height),\n","                    xytext=(0, 3),  # 3 points vertical offset\n","                    textcoords=\"offset points\",\n","                    ha='center', va='bottom')\n","\n","ax = plt.subplot()\n","ax.grid(axis=\"y\", alpha=0.75)\n","ax.set_axisbelow(True)\n","plt.savefig(\"/content/drive/My Drive/Università/3 ANNO MAGISTRALE/TESI/1_corpus/metadata_htids_labels.png\", dpi=300)"],"execution_count":null,"outputs":[{"output_type":"stream","text":["/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:19: MatplotlibDeprecationWarning: Adding an axes using the same arguments as a previous axes currently reuses the earlier instance.  In a future version, a new instance will always be created and returned.  Meanwhile, this warning can be suppressed, and the future behavior ensured, by passing a unique label to each axes instance.\n"],"name":"stderr"},{"output_type":"display_data","data":{"image/png":"iVBORw0KGgoAAAANSUhEUgAAAiYAAAFBCAYAAABD12Q5AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3dfZzVdZ338ddHyHsU0YVGWB0tVzSMWZnMNiXLUAsSyVv0KlJ2affaq3W7M9q2tLq2pa51rV1dWx6XJXrtivdBdy6GqFkmgQ7mTaxKFiKCmoSSpsDn+uP8wBFnhjPAOed3Zl7Px2Me8zvf393b45mZD9/f9/f9RWYiSZJUBjs1OoAkSdImFiaSJKk0LEwkSVJpWJhIkqTSsDCRJEmlYWEiSZJKo6aFSUR8PCIejIgHIuKaiNg1Ig6KiHsi4tGIuDYidq5lBkmS1DxqVphExHDgb4D2zBwFDADOAr4KXJKZbwaeA6bWKoMkSWoutb6UMxDYLSIGArsDK4H3ADcU62cBp9Q4gyRJahI1K0wycwXwT8BvqBQkvwMWA2syc32x2RPA8FplkCRJzWVgrQ4cEfsAE4GDgDXA9cBJvdh/GjANYLfddhvT2tpag5SSJKneHn744Wcy84+6WlezwgR4L/CrzHwaICJuAt4JDI6IgUWvyQhgRVc7Z+ZMYCZAe3t7Llq0qIZRJUlSvUTEr7tbV8sxJr8Bjo6I3SMigOOBh4AFwGnFNlOAOTXMIEmSmkgtx5jcQ2WQ673AL4pzzQQ+A3wiIh4F9gWuqFUGSZLUXGp5KYfMvBC4cIvmZcBRtTyvJElqTs78KkmSSsPCRJIklYaFiSRJKg0LE0mSVBoWJpIkqTQsTCRJUmlYmEiSpNKwMJEkSaVhYSJJkkrDwkSSJJWGhYkkSSoNCxNJklQaFiaSJKk0LEwkSVJpWJhIkqTSsDCRJEmlYWEiSX3U0qVLaWtr2/y111578fWvf53f/va3jBs3jkMOOYRx48bx3HPPNTqqtJmFiST1UYceeigdHR10dHSwePFidt99dyZNmsSMGTM4/vjjeeSRRzj++OOZMWNGo6NKm1mYSFI/MH/+fN70pjdx4IEHMmfOHKZMmQLAlClT+M53vtPgdNKrLEwkqR+YPXs2kydPBmDVqlW0tLQA8MY3vpFVq1Y1Mpr0GhYmktTHvfzyy8ydO5fTTz/9desigohoQCqpaxYmktTH/fCHP+TII49k2LBhAAwbNoyVK1cCsHLlSoYOHdrIeNJrWJhIUh93zTXXbL6MA3DyyScza9YsAGbNmsXEiRMbFU16ncjMRmfYqvb29ly0aFGjY0hS01m3bh0HHHAAy5YtY++99wbg2Wef5YwzzuA3v/kNBx54INdddx1DhgxpcFL1JxGxODPbu1o3sN5hJEn1s8cee/Dss8++pm3fffdl/vz5DUok9axml3Ii4tCI6Oj0tTYi/jYihkTErRHxSPF9n1plkCRJzaVmhUlmLs3MtsxsA8YAvwduBqYD8zPzEGB+8VqSJKlug1+PBx7LzF8DE4FZRfss4JQ6ZZAkSSVXr8LkLOCaYnlYZq4slp8ChtUpgyRJKrmaD36NiJ2Bk4HPbrkuMzMiurwtKCKmAdMAWlpa6OjoqGlOSSqjU2avaHSErfrOWcMbHUF9SD3uynkfcG9mbprzeFVEtGTmyohoAVZ3tVNmzgRmQuV24ba2tjpElaSSaYLCxN/P2pHqcSlnMq9exgGYC0wplqcAc+qQQZIkNYGaFiYRsQcwDripU/MMYFxEPAK8t3gtSZJU20s5mbkO2HeLtmep3KUjSZL0Gj4rR5IklYaFiSRJKg0LE0mSVBoWJpIkqTQsTCRJUmlYmEiSpNKwMJEkSaVhYSJJkkrDwkSSJJWGhYkkSSoNCxNJklQaFiaSJKk0LEwkSVJpWJhIkqTSsDCRJEmlYWEiSZJKw8JEkiSVhoWJJEkqDQsTSZJUGhYmkiSpNCxMJElSaViYSJKk0rAwkSRJpWFhIkmSSsPCRJIklUZNC5OIGBwRN0TELyPi4Yh4R0QMiYhbI+KR4vs+tcwgSZKaR617TL4B3JKZI4HRwMPAdGB+Zh4CzC9eS5Ik1a4wiYi9gbHAFQCZ+XJmrgEmArOKzWYBp9QqgyRJai617DE5CHga+HZE3BcR/zci9gCGZebKYpungGE1zCBJkprIwBof+0jgY5l5T0R8gy0u22RmRkR2tXNETAOmAbS0tNDR0VHDqJKkbeXvZ+1ItSxMngCeyMx7itc3UClMVkVES2aujIgWYHVXO2fmTGAmQHt7e7a1tdUwqiSV1OwVjU6wVf5+1o5Us0s5mfkUsDwiDi2ajgceAuYCU4q2KcCcWmWQJEnNpZY9JgAfA/4jInYGlgHnUimGrouIqcCvgTNqnEGSJDWJmhYmmdkBtHex6vhanleSJDUnZ36VJEmlYWEiSZJKw8JEkiSVhoWJJEkqDQsTSZJUGhYmkiSpNGo9j4kkSb3S2trKoEGDGDBgAAMHDmTRokV8/vOfZ86cOey0004MHTqUK6+8kv3337/RUVUD9phIkkpnwYIFdHR0sGjRIgA+/elPc//999PR0cGECRP40pe+1OCEqhULE0lS6e21116bl9etW0dENDCNaslLOZKkUokITjjhBCKCj370o0ybNg2Az33uc1x11VXsvffeLFiwoMEpVSv2mEiSSuWuu+7i3nvv5Yc//CGXXXYZd955JwD/8A//wPLlyznnnHO49NJLG5xStWJhIkkqleHDhwMwdOhQJk2axMKFC1+z/pxzzuHGG29sRDTVgYWJJKk01q1bx/PPP795ed68eYwaNYpHHnlk8zZz5sxh5MiRjYqoGnOMiSRVqavbWK+//nouuugiHn74YRYuXEh7e1cPVFe1Vq1axaRJkwBYv349Z599NieddBKnnnoqS5cuZaedduLAAw/km9/8ZoOTqlYsTCSpFxYsWMB+++23+fWoUaO46aab+OhHP9rAVH3HwQcfzJIlS17X7qWb/sPCRJK2w2GHHdboCFKf4hgTSarSpttYx4wZw8yZMxsdR+qT7DGRpCrdddddDB8+nNWrVzNu3DhGjhzJ2LFjGx1L6lPsMZGkKm3tNlZJ288eE0mqwrp169i4cSODBg3afBvrF77whUbHKqXW6d9vdIStenzG+EZHUDe22mMSEedHxF5RcUVE3BsRJ9QjnCSVxapVqzjmmGMYPXo0Rx11FOPHj+ekk07i5ptvZsSIEdx9992MHz+eE088sdFRpaZWTY/JeZn5jYg4EdgH+BBwNTCvpskkqUS6u4110qRJm+fdkLT9qhljsukRju8Hrs7MBzu1SZIk7TDVFCaLI2IelcLkvyJiELCxtrEkSVJ/VM2lnKlAG7AsM38fEfsC59Y2liRJ6o+6LUwi4sgtmg6O6N0VnIh4HHge2ACsz8z2iBgCXAu0Ao8DZ2Tmc706sCRJ6pN66jG5uId1CbynynO8OzOf6fR6OjA/M2dExPTi9WeqPJYkbZey38rqbazq77otTDLz3TU650TguGJ5FnA7FiaSJInq5jHZPSL+PiJmFq8PiYgJVR4/gXkRsTgiphVtwzJzZbH8FDCs16klSVKfVM3g128Di4E/K16vAK4HvlfFvsdk5oqIGArcGhG/7LwyMzMisqsdi0JmGkBLSwsdHR1VnE6Smlsz/q4zs3akagqTN2XmmRExGaC4M6eqUbCZuaL4vjoibgaOAlZFREtmroyIFmB1N/vOBGYCtLe3Z1tbWzWnlKSezV7R6AQ9et3vupLnhT6SWaVRzTwmL0fEblQuyxARbwL+sLWdImKPYs4TImIP4ATgAWAuMKXYbAowZxtyS5KkPqiaHpOLgFuAP46I/wDeCXykiv2GATcXnSsDgf/MzFsi4ufAdRExFfg1cMY25JYkSX3QVguTzJwXEYuBo6lMRX/+Frf/drffMmB0F+3PAsdvQ1ZJktTHbbUwiYjvAv8JzM3MdbWPJEmS+qtqxpj8E3As8FBE3BARp0XErjXOJUmS+qFqLuXcAdwREQOozPb6F8C3gL1qnE2SJPUz1Qx+pbgr5wPAmcCRVGZslSRJ2qGqGWNyHZX5R24BLgXuyMyNtQ4mSZL6n2p6TK4AJmfmhlqHkSRJ/Vs1hcltwF9HxNji9R3ANzPzldrFkiRJ/VE1hcnlwBuAfytef6ho+/NahZIkSf1TNYXJ2zKz80Rpt0XEkloFkiRJ/Vc185hsKJ6PA0BEHAw43kSSJO1w1fSYfBpYEBHLqExJfyBwbk1TSZKkfqmaCdbmR8QhwKFF09LM3OrThSVJknqrqgnWgDFAa7F9W0SQmVfVLJUkSeqXqplg7WrgTUAHr44tScDCRJIk7VDV9Ji0A4dnZtY6jCRJ6t+quSvnAeCNtQ4iSZJUTY/JfsBDEbEQ2DzoNTNPrlkqSZLUL1VTmFxU6xCSJElQ3e3Cd9QjiCRJUjVjTCRJkurCwkSSJJVGt4VJRMwvvn+1fnEkSVJ/1tMYk5aI+DPg5IiYTeU5OZtl5r01TSZJkvqdngqTLwCfB0YA/7zFugTeU6tQkiSpf+q2MMnMG4AbIuLzmfnlOmaSJEn9VDW3C385Ik4GxhZNt2fm96o9QUQMABYBKzJzQkQcBMwG9gUWAx/KzJd7H12SJPU1W70rJyL+ETgfeKj4Oj8ivtKLc5wPPNzp9VeBSzLzzcBzwNReHEuSpFJ56aWXOOqooxg9ejRvectbuPDCCwE49thjaWtro62tjf33359TTjmlwUmbQzUzv44H2jJzI0BEzALuA/5uaztGxIhi/38APhERQWVsytnFJrOozCx7ea+TS5JUArvssgu33XYbe+65J6+88grHHHMM73vf+/jxj3+8eZtTTz2ViRMnNjBl86h2HpPBnZb37sXxvw5cAGwsXu8LrMnM9cXrJ4DhvTieJEmlEhHsueeeALzyyiu88sorVP4dXrF27Vpuu+02e0yqVE2PyT8C90XEAiq3DI8Fpm9tp4iYAKzOzMURcVxvg0XENGAaQEtLCx0dHb09hCQ1nWb8XWdm2LBhA2effTbLly/nzDPPZJdddtl8ju9+97uMGTOGZcuW7dBz9lXVDH69JiJuB95WNH0mM5+q4tjvpDIHyvuBXYG9gG8AgyNiYNFrMgJY0c15ZwIzAdrb27Otra2KU0rSVszu8ldOabzud13J80IfybwDLF26lDVr1jBp0iQGDhzIqFGjAPjsZz/LX/3VX9XknH1RVZdyMnNlZs4tvqopSsjMz2bmiMxsBc4CbsvMc4AFwGnFZlOAOduQW5Kk0hk8eDDvfve7ueWWWwB45plnWLhwIePHj29wsubRiGflfIbKQNhHqYw5uaIBGSRJ2iGefvpp1qxZA8CLL77IrbfeysiRIwG44YYbmDBhArvuumsjIzaVasaYbLfMvB24vVheBhxVj/NKklRrK1euZMqUKWzYsIGNGzdyxhlnMGHCBABmz57N9OlbHZapTnosTIrJ0R7MzJF1yiNJUlN561vfyn333dfluttvv72+YfqAHi/lZOYGYGlEHFCnPJIkqR+r5lLOPsCDEbEQWLepMTNPrlkqSZLUL1VTmHy+5ikkSZKobh6TOyLiQOCQzPxRROwODKh9NEmS1N9stTCJiL+gMgPrEOBNVKaQ/yZwfG2jSZJUH63Tv9/oCD16fEb/mQelmnlM/prKLK5rATLzEWBoLUNJ6tu6exrr1KlTGT16NG9961s57bTTeOGFFxqcVFK9VVOY/CEzX970IiIGAlm7SJL6uk1PY12yZAkdHR3ccsst/OxnP+OSSy5hyZIl3H///RxwwAFceumljY4qqc6qKUzuiIi/A3aLiHHA9cB3axtLUl/W3dNY99prLwAykxdffPE1T2iV1D9UU5hMB54GfgF8FPgB8Pe1DCWp79uwYQNtbW0MHTqUcePG8fa3vx2Ac889lze+8Y388pe/5GMf+1iDU0qqt60WJpm5EZgFfBn4IjArM72UI2m7DBgwgI6ODp544gkWLlzIAw88AMC3v/1tnnzySQ477DCuvfbaBqeUVG9bLUwiYjzwGPAvwKXAoxHxvloHk9Q/bPk0VqgULWeddRY33nhjA5NJaoRqLuVcDLw7M4/LzHcB7wYuqW0sSX1ZV09jPfTQQ3n00UeByhiTuXPnbn5Cq6T+o5qZX5/PzEc7vV4GPF+jPJL6ga6exjp+/HiOPfZY1q5dS2YyevRoLr/88kZHlVRn3RYmEfHBYnFRRPwAuI7KbcKnAz+vQzZJfVR3T2P9yU9+0oA0ksqkpx6TD3RaXgW8q1h+GtitZokkSVK/1W1hkpnn1jOIJElSNc/KOQj4GNDaefvMPLl2sSRJUn9UzeDX7wBXUJntdWNt40iSpP6smsLkpcz8l5onkdR0fCKrpB2tmsLkGxFxITAP+MOmxsy8t2apJElSv1RNYXIE8CHgPbx6KSeL15IkSTtMNYXJ6cDBmflyrcNIkqT+rZop6R8ABtc6iCRJUjU9JoOBX0bEz3ntGBNvF5YkSTtUNYXJhdty4IjYFbgT2KU4zw2ZeWExL8psYF9gMfAhLxNJkiSoojDJzDu28dh/AN6TmS9ExBuAuyLih8AngEsyc3ZEfBOYCvikLmkbLV++nA9/+MOsWrWKiGDatGmcf/75nHnmmSxduhSANWvWMHjwYDo6OhqcVpJ6Vs3Mr89TuQsHYGfgDcC6zNyrp/0yM4EXipdvKL423c1zdtE+C7gICxNpmw0cOJCLL76YI488kueff54xY8Ywbtw4rr322s3bfPKTn2TvvfduYEpJqk41PSaDNi1HRAATgaOrOXhEDKByuebNwGXAY8CazFxfbPIEMLyXmSV10tLSQktLCwCDBg3isMMOY8WKFRx++OEAZCbXXXcdt912WyNjSlJVqhljslnRC/KdYsK16VVsvwFoi4jBwM3AyGrPFRHTgGlQ+cVrF7S0dU8++SQLFy5kt9122/wzs3jxYgYNGsS6dev63c9RM/73mrk+mi1zs+XdHtVcyvlgp5c7Ae3AS705SWauiYgFwDuAwRExsOg1GQGs6GafmcBMgPb29mxra+vNKaV+54UXXmDq1KlcdtllHHPMMZvb//3f/52pU6dSk5+h2V3++JZGl//NzZa55HnBzPXQn/4GVtNj8oFOy+uBx6lczulRRPwR8EpRlOwGjAO+CiwATqNyZ84UYE4vM0vawiuvvMKpp57KOeecwwc/+Oq/JdavX89NN93E4sWLG5hOkqpXzRiTc7fx2C3ArGKcyU7AdZn5vYh4CJgdEf8buI/Kk4slbaPMZOrUqRx22GF84hOfeM26H/3oR4wcOZIRI0Y0KJ0k9U63hUlEfKGH/TIzv9zTgTPzfuBPu2hfBhxVdUJJPfrJT37C1VdfzRFHHLG5u/crX/kK73//+5k9ezaTJ09ucEJJql5PPSbrumjbg8q8I/sCPRYmkurjmGOOoTIu/fWuvPLK+oaRpO3UbWGSmRdvWo6IQcD5wLlUxoZc3N1+kiRJ26rHMSYRMYTKTK3nUJkM7cjMfK4ewSRJUv/T0xiT/wN8kMotu0dk5gvdbStJkrQj7NTDuk8C+wN/DzwZEWuLr+cjYm194kmSpP6kpzEmPRUtknaw1unfb3SEHj0+Y3yjI0jqByw+JElSaViYSJKk0rAwkSRJpWFhIkmSSsPCRJIklYaFiSRJKg0LE0mSVBoWJpIkqTQsTCRJUmlYmEiSpNKwMJEkSaVhYSJJkkrDwkSSJJWGhYkkSSoNCxNJklQaFiaSJKk0LEwkSVJpWJhIktTPnHfeeQwdOpRRo0Ztbuvo6ODoo4+mra2N9vZ2Fi5c2JBsFiaSJPUzH/nIR7jlllte03bBBRdw4YUX0tHRwZe+9CUuuOCChmSrWWESEX8cEQsi4qGIeDAizi/ah0TErRHxSPF9n1plkCRJrzd27FiGDBnymraIYO3atQD87ne/Y//9929ENAbW8NjrgU9m5r0RMQhYHBG3Ah8B5mfmjIiYDkwHPlPDHJIkaSu+/vWvc+KJJ/KpT32KjRs38tOf/rQhOWrWY5KZKzPz3mL5eeBhYDgwEZhVbDYLOKVWGSRJUnUuv/xyLrnkEpYvX84ll1zC1KlTG5KjLmNMIqIV+FPgHmBYZq4sVj0FDKtHBkmS1L1Zs2bxwQ9+EIDTTz+9YYNfa3kpB4CI2BO4EfjbzFwbEZvXZWZGRHaz3zRgGkBLSwsdHR21jiqpB834M2jm+jBz7dUi75NPPslLL720+dj77rsvV1xxBe3t7dxzzz2MGDGiIe9TTQuTiHgDlaLkPzLzpqJ5VUS0ZObKiGgBVne1b2bOBGYCtLe3Z1tbWy2jSo03e0WjE/Soy59BM+9wr8tc8rxg5nrY0X8DJ0+ezO23384zzzzDhAkT+OIXv8hVV13F+eefz/r169l11125+uqrd/h5q1GzwiQqXSNXAA9n5j93WjUXmALMKL7PqVUGSZL0etdcc02X7YsXL65zkterZY/JO4EPAb+IiE19QX9HpSC5LiKmAr8GzqhhBkmS1ERqVphk5l1AdLP6+FqdV5IkNS9nfpUkSaVhYSJJkkqj5rcLS83mvPPO43vf+x5Dhw7lgQceeM26iy++mE996lM8/fTT7Lfffg1KKEnQOv37jY7Qo8dnjN+m/ewxkbbQ1cOtAJYvX868efM44IADGpBKkvoHCxNpC1093Arg4x//OF/72tfoPEmgJGnHsjCRqjBnzhyGDx/O6NGjGx1Fkvo0x5hIW/H73/+er3zlK8ybN6/RUSSpz7PHRNqKxx57jF/96leMHj2a1tZWnnjiCY488kieeuqpRkeTpD7HHhNpK4444ghWr371kU6tra0sWrTIu3IkqQbsMZG2MHnyZN7xjnewdOlSRowYwRVXXNHoSJLUb9hjIm2hu4dbbfL444/XJ4gk9UP2mEiSpNKwMJEkSaXRbwuT8847j6FDhzJq1KjNbddffz1vectb2GmnnVi0aFED0/Udvs+SpN7ot4VJV9OOjxo1iptuuomxY8c2KFXf4/ssSeqNfjv4dezYsa8bxHjYYYc1Jkwf1qj3ua8+3EqS+rp+22MiSZLKx8JEkiSVhoWJJEkqDQsTSZJUGv22MOlq2vGbb76ZESNGcPfddzN+/HhOPPHERsdser7PkqTe6Ld35XQ37fikSZPqnKRv832WJPVGv+0xkSRJ5WNhIkmSSsPCRJIklUbNxphExLeACcDqzBxVtA0BrgVagceBMzLzue05jzN81l7Z32PoG++zJKm2PSZXAidt0TYdmJ+ZhwDzi9eSJElADQuTzLwT+O0WzROBWcXyLOCUWp1fkiQ1n3qPMRmWmSuL5aeAYXU+vyRJKrGGzWOSmRkR2d36iJgGTANoaWmho6Ojbtl2pGbN3Wya7X1utrxg5noxc300W+ZmywvbnrnehcmqiGjJzJUR0QKs7m7DzJwJzARob2/Ptra2rjecvaIWOXeYbnM3k5K/x9DF+1zyzF1+Lsy8w/WJzCXPC2auhz7xWa5SvS/lzAWmFMtTgDl1Pr8kSSqxmhUmEXENcDdwaEQ8ERFTgRnAuIh4BHhv8VqSJAmo4aWczJzczarja3XO/qa1tZVBgwYxYMAABg4cyKJFixodSZKk7dJvH+LXVyxYsID99tuv0TEkSdohnJJekiSVhoVJE4sITjjhBMaMGcPMmTMbHUeSpO3mpZwmdtdddzF8+HBWr17NuHHjGDlyJGPHjm10LEmStpk9Jk1s+PDhAAwdOpRJkyaxcOHCBieSJGn7WJg0qXXr1vH8889vXp43bx6jRo1qcCpJkraPl3Ka1KpVq5g0aRIA69ev5+yzz+akk7Z8mLMkSc3FwqRJHXzwwSxZsqTRMSRJ2qG8lCNJkkrDwkSSJJWGhYkkSSoNCxNJklQaDn6ts9bp3290hK16fMb4RkeQJPVT9phIkqTSsDCRJEmlYWEiSZJKw8JEkiSVhoWJJEkqDQsTSZJUGhYmkiSpNCxMJElSaViYSJKk0rAwkSRJpWFhIkmSSsPCRJIklUZDCpOIOCkilkbEoxExvREZJElS+dS9MImIAcBlwPuAw4HJEXF4vXNIkqTyaUSPyVHAo5m5LDNfBmYDExuQQ5IklUwjCpPhwPJOr58o2iRJUj8XmVnfE0acBpyUmX9evP4Q8PbM/F9bbDcNmFa8PBRYWqeI+wHP1OlcO0qzZW62vGDmejFzfTRb5mbLC2bemgMz84+6WjGwTgE6WwH8cafXI4q218jMmcDMeoXaJCIWZWZ7vc+7PZotc7PlBTPXi5nro9kyN1teMPP2aMSlnJ8Dh0TEQRGxM3AWMLcBOSRJUsnUvcckM9dHxP8C/gsYAHwrMx+sdw5JklQ+jbiUQ2b+APhBI85dhbpfPtoBmi1zs+UFM9eLmeuj2TI3W14w8zar++BXSZKk7jglvSRJKo0+X5hExLciYnVEPNCpbXRE3B0Rv4iI70bEXp3WfbaYKn9pRJzYqb1u0+j3JnNE7BsRCyLihYi4dIvjjCm2fzQi/iUioiSZx0XE4qJ9cUS8pwkyHxURHcXXkoiY1GmfUn42Oq0/oPh8fKremXv5HrdGxIud3udvdtqnlJ+LYt1bi3UPFut3LXPmiDin03vcEREbI6Kt5JnfEBGzivaHI+KznfYp5c9fROwcEd8u2pdExHGd9qnL+xwRfxyVvw8PFZ/P84v2IRFxa0Q8Unzfp2iPIs+jEXF/RBzZ6VhTiu0fiYgptci7WWb26S9gLHAk8ECntp8D7yqWzwO+XCwfDiwBdgEOAh6jMkB3QLF8MLBzsc3hJcm8B3AM8JfApVscZyFwNBDAD4H3lSTznwL7F8ujgBVNkHl3YGCx3AKspjJGq7SfjU7rbwCuBz5VvK5b5l6+x62dt2uSz/JA4H5gdPF6X2BAmTNvsd8RwGNN8D6fDcwulncHHi8+L6X9+QP+Gvh2sTwUWAzsVM/3mcrvqiOL5UHAf1P5O/c1YHrRPh34arH8/iJPFPnuKdqHAMuK7/sUy/vU6n3u8z0mmXkn8Nstmv8EuLNYvhU4tVieSOXD/4fM/BXwKJUp9Os6jX5vMmfmusy8C3ip82yirkIAAAghSURBVMYR0QLslZk/y8on6yrglJJkvi8znyzaHwR2i4hdSp7595m5vmjfFdg0OKu0nw2AiDgF+BWV93mTumXubd6ulPlzAZwA3J+ZS4p9n83MDSXP3NlkKv//y/4+J7BHRAwEdgNeBtZS7p+/w4Hbiv1WA2uA9nq+z5m5MjPvLZafBx6mMtP6RGBWsdmsTuefCFyVFT8DBhd5TwRuzczfZuZzxX/nSbXIDP3gUk43HuTVD+/pvDrhW3fT5ZdhGv3uMndnOJWcm5Q186nAvZn5B0qeOSLeHhEPAr8A/rIoVEr72YiIPYHPAF/cYvtGZ+7pc3FQRNwXEXdExLFFW5k/F38CZET8V0TcGxEXFO1lztzZmcA1xXKZM98ArANWAr8B/ikzf0vjP8vQfeYlwMkRMTAiDgLGFOsa8j5HRCuV3up7gGGZubJY9RQwrFguxd/A/lqYnAf8z4hYTKV76+UG56lGn8scEW8Bvgp8tAHZutNt5sy8JzPfArwN+OymsQQl0F3mi4BLMvOFRgXrRnd5VwIHZOafAp8A/jO2GC/TQN1lHkjlUuo5xfdJEXF8YyK+ztZ+/t4O/D4zH+hq5wbpLvNRwAZgfyqX2T8ZEQc3JuLrdJf5W1T+gC8Cvg78lMp/Q90V/0i5EfjbzFzbeV3Ra1Oq23MbMo9Jo2XmL6l0wRIRfwKML1b1NF3+VqfRr6UeMndnBZWcm5Qqc0SMAG4GPpyZjxXNpc7caZuHI+IFivExlPez8XbgtIj4GjAY2BgRL1G51t2wzN3lLXrN/lAsL46Ix6j0SJT5c/EEcGdmPlOs+wGVMQj/j/Jm3uQsXu0tgXK/z2cDt2TmK8DqiPgJ0E7lX/Gl/PkrelQ/vmm7iPgplTEez1HH9zki3kClKPmPzLypaF4VES2ZubK4VLO6aO/u99kK4Lgt2m+vVeZ+2WMSEUOL7zsBfw9sGv0/FzirGO9wEHAIlUFKDZ9Gv4fMXSq66dZGxNHFiO8PA3NqHrST7jJHxGDg+1QGX/2kSTIfVFzfJiIOBEZSGYBX2s9GZh6bma2Z2UrlX2xfycxLG525h/f4jyJiQLF8MJWfv2Vl/lxQmcH6iIjYvfh8vAt4qOSZN7WdQTG+BMr980fl8s17inV7UBmY+UtK/PNXfCb2KJbHAeszs66fjeL4VwAPZ+Y/d1o1F9h0Z82UTuefC3w4Ko4Gflfk/S/ghIjYJyp38JxQtNVGrUbVluWLyr8IVgKvUPnXzVTgfCqV638DMygmmiu2/xyVUd5L6TRSmspo5f8u1n2uZJkfpzIg64Vi+8OL9nbggSLzpZ33aWRmKj+864COTl9DS575Q1SuJXcA9wKnNMNno9N+F1HclVPPzL18j0/d4j3+QKfjlPJzUWz/P4rcDwBfa5LMxwE/6+I4pcwM7EnlzrIHgYeAT9f7s7wNmVup/B15GPgRlafp1vV9pnJ5MancObbpd+37qdw9Nh94pMg2pNg+gMuKXL8A2jsd6zwqN4Q8Cpxby/fZmV8lSVJp9MtLOZIkqZwsTCRJUmlYmEiSpNKwMJEkSaVhYSJJkkrDwkTqZyJiQ1SeKvtARFwfEbtvZfvHI2K/LtoviuKJxRHxpYh47zZkaY2Iszu9bo+If+ntcao4z0dii6dvd7HNcRHxZzv63JJ6x8JE6n9ezMy2zBxFZfrsv9zeA2bmFzLzR9uwayuVWT03HWdRZv7N9ubZRscBFiZSg1mYSP3bj4E3F70F39vUGBGXRsRHOm13QUT8IiIWRsSbtzxIRFwZEacVy2+LiJ9GxJJi+0FFz8iPo/Kgu3s79UzMAI4tenA+3jlHRAyJiO9ExP0R8bOIeGvRflFEfCsibo+IZRHRZSETEedGxH9HxELgnZ3aPxAR90TlYYE/iohhUXnA2V8CHy+yHFvMRHtjRPy8+HpnV+eRtGP1y2flSIJiCvX3AbdUsfnvMvOIiPgwlentJ3RzzJ2Ba4EzM/PnUXkI34tUnsUxLjNfiohDqMyg2Q5MpzIj7YRi/+M6He6LwH2ZeUpEvIfK4+HbinUjgXdTeWja0oi4PCvPUdmUo6XYfwzwO2ABcF+x+i7g6MzMiPhz4ILM/GREfBN4ITP/qTjGf1J5COJdEXEAlSm4D6vivZK0HSxMpP5nt4joKJZ/TOVZGlu7hHFNp++X9LDdocDKzPw5QBZPMi2eGXJpRLRRecLqn1SR8xgq09STmbdFxL7x6tOGv5/FQ/8iYjWVx7Z3fpT824HbM/Pp4vzXdjrnCODaonjZGfhVN+d/L3B45XEjAOwVEXtm+Z7WLPUpFiZS//NiZrZ1boiI9bz20u6uW+yT3SxX6+PAKmB0cZ6XtuEYnf2h0/IGeve77F+Bf87MuUUPzUXdbLcTlZ6V7c0qqRccYyIJ4NdUegd2icrTn4/fYv2Znb7f3cNxlgItEfE2gGJ8yUBgbyo9KRupPBBxQLH981Qux3Tlx8A5xXGOA57Z1ANThXuAdxW9LG8ATu+0bm9efcz8lE7tW2aZB3xs04uit0dSjdljIonMXB4R11F54umveHU8xib7RMT9VHoqJvdwnJcj4kzgXyNiNyrjS94L/BtwYzFG5RYqT5eGylNPN0TEEuDKLc57EfCt4ry/57VFxNb+e1ZGxEVUiqg1VJ6q2vm410fEc8BtwEFF+3eBGyJiIpWC5G+Ay4rzDwTuZAfcwSSpZz5dWJIklYaXciRJUmlYmEiSpNKwMJEkSaVhYSJJkkrDwkSSJJWGhYkkSSoNCxNJklQaFiaSJKk0/j8V1l4+KK1j/AAAAABJRU5ErkJggg==\n","text/plain":["<Figure size 648x360 with 1 Axes>"]},"metadata":{"tags":[],"needs_background":"light"}}]},{"cell_type":"code","metadata":{"id":"QV7wcC1xnsbd"},"source":["#Save clean dataset\n","d.to_csv(\"scifi_metadata_htids.csv\")"],"execution_count":null,"outputs":[]}]}